library(bigMap)
# load aux. stuff
source('./mcsk15.R')

Load data

# first 50 principal components
X <- as.matrix(read.csv('./mcsk15_data.csv.gz'))

ptSNE

# ./mcsk15/start.R

library(bigMap)

X <- as.matrix(read.csv('./mcsk15_data.csv.gz'))

threads <- 40
ppx.list <- round(nrow(X) * c(.005, .01, .05, .10, .20, .30, .40, .50), 0)

# +++ start MPI cluster
mpi.cl <- bdm.mpi.start(threads)
if (is.null(mpi.cl)) return()

# +++ run
m.list <- lapply(ppx.list, function(ppx)
{
        # +++ compute betas
        m <- bdm.init(X, dSet.name = 'mck15', ppx = ppx, threads = threads, mpi.cl = mpi.cl)
        # +++ ptSNE
        m <- bdm.ptsne(NULL, m, lRate = NULL, theta = 0.0, threads = threads, mpi.cl = mpi.cl, layers = 2)
        # +++ EFR
        m.efr <- bdm.efr(NULL, list(m), ppx = ppx, iters = 100, threads = threads, mpi.cl = mpi.cl)
        # +++ EFR (ppx = 45)
        m.efr <- bdm.efr(NULL, list(m), ppx = 45, iters = 100, threads = threads, mpi.cl = mpi.cl)
        # +++ kNP
        m.efr <- lapply(m.efr, function(m) bdm.knp(NULL, m, threads = threads, mpi.cl = mpi.cl))
        # +++ hlC
        m.efr <- lapply(m.efr, function(m) bdm.hlCorr(NULL, m, threads = threads, mpi.cl = mpi.cl))
        #
        m.efr
})

save(m.list, file = './mcsk15_list.RData')

# +++ stop cluster
stopCluster(mpi.cl)

Submit job:

$ qsub -pe make 20 -l h_vmem=4G Rsckt ./mcsk15/start.R

Load output

# load ouput
load('./mcsk15_list.RData')
# pt-SNE embedding
m.list1 <- lapply(m.list, function(m.ppx) m.ppx[[1]])
# pt-SNE+EFC.ppx
m.list2 <- lapply(m.list, function(m.ppx) m.ppx[[2]])
# pt-SNE+EFC.45
m.list3 <- lapply(m.list, function(m.ppx) m.ppx[[3]])

Range of perplexities

sapply(m.list1, function(m) m$ppx$ppx)
## [1]   224   448  2240  4481  8962 13442 17923 22404

Embedding Cost/Size

nulL <- lapply(m.list1, function(m) bdm.cost(m))

Embedding

mcsk15.legend()

# labels
L <- mcsk15.lbls(l = 1)

pt-SNE

nulL <- lapply(m.list1, function(m) {
  m$lbls <- L
  bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})

pt-SNE+EFR.ppx

nulL <- lapply(m.list2, function(m) {
  m$lbls <- L
  bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})

pt-SNE+EFR.45

nulL <- lapply(m.list3, function(m) {
  m$lbls <- L
  bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})

hl-Correlation

pt-SNE

hlTable <- sapply(m.list1, function(m) summary(m$hlC)[4])
hlTable <- matrix(hlTable, nrow = 1)
colnames(hlTable) <- sapply(m.list1, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
224 448 2240 4481 8962 13442 17923 22404
<hlC> 0.1328081 0.0965539 0.1739388 0.1886429 0.1988989 0.314965 0.9076846 0.8988457

PCA (2 first components)

Note the HL-Correlation (~90%) for high perplexities (40%, 50% of data set size) and the similarity between the embedding and the PCA 2 first components plot;

# PCA plot
plot(X[, 1], X[, 2], pch = 15, cex = 0.3, col = MACOSKO_COLORS1[L])
# pt-SNE (ppx=17923)
m <- m.list1[[7]]
m$lbls <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)

pt-SNE+EFR.ppx

hlTable <- sapply(m.list2, function(m) summary(m$hlC)[4])
hlTable <- matrix(hlTable, nrow = 1)
colnames(hlTable) <- sapply(1:8, function(i) paste(m.list1[[i]]$ppx$ppx, '+', m.list2[[i]]$ppx$ppx))
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
224 + 224 448 + 448 2240 + 2240 4481 + 4481 8962 + 8962 13442 + 13442 17923 + 17923 22404 + 22404
<hlC> 0.1325114 0.0972726 0.175903 0.1946172 0.2044327 0.3113044 0.9063941 0.8983952

pt-SNE+EFR.45

hlTable <- sapply(m.list3, function(m) mean(m$hlC))
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(1:8, function(i) paste(m.list1[[i]]$ppx$ppx, '+', m.list3[[i]]$ppx$ppx))
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
224 + 45 448 + 45 2240 + 45 4481 + 45 8962 + 45 13442 + 45 17923 + 45 22404 + 45
<hlC> 0.1343 0.0978 0.1752 0.1907 0.1858 0.2115 0.1852 0.1905

Kary-neighborhood preservation

bdm.knp.plot(m.list1)
bdm.knp.plot(m.list2)
bdm.knp.plot(m.list3)

pt-SNE vs. pt-SNE+EFR.ppx

nulL <- lapply(1:8, function(i) {
  bdm.knp.plot(list(m.list1[[i]], m.list2[[i]]))
})

pt-SNE vs. pt-SNE+EFR.45

nulL <- lapply(1:8, function(i) {
  bdm.knp.plot(list(m.list1[[i]], m.list3[[i]]))
})

Running Times

rTimes <- sapply(seq_along(m.list), function(i) {
  m1 <- m.list1[[i]]
  m3 <- m.list3[[i]]
  c(m1$ppx$t[[3]], m1$t$epoch, m1$t$ptsne[[3]], sum(m1$ppx$t[[3]]+m1$t$ptsne[[3]]), m3$t$efr[[3]])
})
rTimes <- round(rTimes /60, 2)
colnames(rTimes) <- sapply(m.list1, function(m) m$ppx$ppx)
rownames(rTimes) <- c('betas', 'epoch', 'pt-SNE', 'total', 'EFR.45')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
  kable_styling(full_width = F)
Computation times (min)
224 448 2240 4481 8962 13442 17923 22404
betas 0.31 0.35 0.50 0.70 1.12 1.09 0.63 0.59
epoch 0.28 0.26 0.26 0.27 0.28 0.27 0.27 0.27
pt-SNE 20.39 18.99 16.51 15.58 14.67 13.43 13.59 13.54
total 20.71 19.34 17.01 16.27 15.79 14.52 14.22 14.13
EFR.45 4.77 4.79 4.80 4.80 4.80 4.79 4.78 4.80

pt-SNE run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 41 cores, 4GB/core RAM.

EFR run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 20 cores, 4GB/core RAM.